In [1]:
%matplotlib inline
import warnings
import seaborn as sns
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score,confusion_matrix, classification_report,roc_auc_score
from scipy.stats import zscore
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from scipy.stats import iqr
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder 
/Users/clodaya/opt/anaconda3/lib/python3.7/site-packages/statsmodels/tools/_testing.py:19: FutureWarning: pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead.
  import pandas.util.testing as tm
In [2]:
data = pd.read_csv("vehicle.csv")
print('\n\nshape of data:', data.shape)
print('\n\nCount of target variable:\n\n', data['class'].value_counts(),'\n\nData types of attributes:\n\n')
print(data.info())
print('\n\n First five rows of data:\n\n', data.head())
print('\n\n Few statistical values of attributes:\n\n', data.describe())

shape of data: (846, 19)


Count of target variable:

 car    429
bus    218
van    199
Name: class, dtype: int64 

Data types of attributes:


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 846 entries, 0 to 845
Data columns (total 19 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   compactness                  846 non-null    int64  
 1   circularity                  841 non-null    float64
 2   distance_circularity         842 non-null    float64
 3   radius_ratio                 840 non-null    float64
 4   pr.axis_aspect_ratio         844 non-null    float64
 5   max.length_aspect_ratio      846 non-null    int64  
 6   scatter_ratio                845 non-null    float64
 7   elongatedness                845 non-null    float64
 8   pr.axis_rectangularity       843 non-null    float64
 9   max.length_rectangularity    846 non-null    int64  
 10  scaled_variance              843 non-null    float64
 11  scaled_variance.1            844 non-null    float64
 12  scaled_radius_of_gyration    844 non-null    float64
 13  scaled_radius_of_gyration.1  842 non-null    float64
 14  skewness_about               840 non-null    float64
 15  skewness_about.1             845 non-null    float64
 16  skewness_about.2             845 non-null    float64
 17  hollows_ratio                846 non-null    int64  
 18  class                        846 non-null    object 
dtypes: float64(14), int64(4), object(1)
memory usage: 125.7+ KB
None


 First five rows of data:

    compactness  circularity  distance_circularity  radius_ratio  \
0           95         48.0                  83.0         178.0   
1           91         41.0                  84.0         141.0   
2          104         50.0                 106.0         209.0   
3           93         41.0                  82.0         159.0   
4           85         44.0                  70.0         205.0   

   pr.axis_aspect_ratio  max.length_aspect_ratio  scatter_ratio  \
0                  72.0                       10          162.0   
1                  57.0                        9          149.0   
2                  66.0                       10          207.0   
3                  63.0                        9          144.0   
4                 103.0                       52          149.0   

   elongatedness  pr.axis_rectangularity  max.length_rectangularity  \
0           42.0                    20.0                        159   
1           45.0                    19.0                        143   
2           32.0                    23.0                        158   
3           46.0                    19.0                        143   
4           45.0                    19.0                        144   

   scaled_variance  scaled_variance.1  scaled_radius_of_gyration  \
0            176.0              379.0                      184.0   
1            170.0              330.0                      158.0   
2            223.0              635.0                      220.0   
3            160.0              309.0                      127.0   
4            241.0              325.0                      188.0   

   scaled_radius_of_gyration.1  skewness_about  skewness_about.1  \
0                         70.0             6.0              16.0   
1                         72.0             9.0              14.0   
2                         73.0            14.0               9.0   
3                         63.0             6.0              10.0   
4                        127.0             9.0              11.0   

   skewness_about.2  hollows_ratio class  
0             187.0            197   van  
1             189.0            199   van  
2             188.0            196   car  
3             199.0            207   van  
4             180.0            183   bus  


 Few statistical values of attributes:

        compactness  circularity  distance_circularity  radius_ratio  \
count   846.000000   841.000000            842.000000    840.000000   
mean     93.678487    44.828775             82.110451    168.888095   
std       8.234474     6.152172             15.778292     33.520198   
min      73.000000    33.000000             40.000000    104.000000   
25%      87.000000    40.000000             70.000000    141.000000   
50%      93.000000    44.000000             80.000000    167.000000   
75%     100.000000    49.000000             98.000000    195.000000   
max     119.000000    59.000000            112.000000    333.000000   

       pr.axis_aspect_ratio  max.length_aspect_ratio  scatter_ratio  \
count            844.000000               846.000000     845.000000   
mean              61.678910                 8.567376     168.901775   
std                7.891463                 4.601217      33.214848   
min               47.000000                 2.000000     112.000000   
25%               57.000000                 7.000000     147.000000   
50%               61.000000                 8.000000     157.000000   
75%               65.000000                10.000000     198.000000   
max              138.000000                55.000000     265.000000   

       elongatedness  pr.axis_rectangularity  max.length_rectangularity  \
count     845.000000              843.000000                 846.000000   
mean       40.933728               20.582444                 147.998818   
std         7.816186                2.592933                  14.515652   
min        26.000000               17.000000                 118.000000   
25%        33.000000               19.000000                 137.000000   
50%        43.000000               20.000000                 146.000000   
75%        46.000000               23.000000                 159.000000   
max        61.000000               29.000000                 188.000000   

       scaled_variance  scaled_variance.1  scaled_radius_of_gyration  \
count       843.000000         844.000000                 844.000000   
mean        188.631079         439.494076                 174.709716   
std          31.411004         176.666903                  32.584808   
min         130.000000         184.000000                 109.000000   
25%         167.000000         318.000000                 149.000000   
50%         179.000000         363.500000                 173.500000   
75%         217.000000         587.000000                 198.000000   
max         320.000000        1018.000000                 268.000000   

       scaled_radius_of_gyration.1  skewness_about  skewness_about.1  \
count                   842.000000      840.000000        845.000000   
mean                     72.447743        6.364286         12.602367   
std                       7.486190        4.920649          8.936081   
min                      59.000000        0.000000          0.000000   
25%                      67.000000        2.000000          5.000000   
50%                      71.500000        6.000000         11.000000   
75%                      75.000000        9.000000         19.000000   
max                     135.000000       22.000000         41.000000   

       skewness_about.2  hollows_ratio  
count        845.000000     846.000000  
mean         188.919527     195.632388  
std            6.155809       7.438797  
min          176.000000     181.000000  
25%          184.000000     190.250000  
50%          188.000000     197.000000  
75%          193.000000     201.000000  
max          206.000000     211.000000  

In the above block of code

- We read the data set and try to eye ball the data

- There are 846 rows and 19 columns(attributes)

- We check the target variable and see that there is 429 instances of car, 218 instnaces of bus and 199 instances of van

- We check the data types of all attributes and see that compactness, max.length_aspect_ratio, max.length_rectangularity and hollows_ratio are integer types, class is type object and rest all attributes are float types

- We check head of the data set and see 5 rows just to get the feel

- We also see the statistical values of the attributes and see the data are mostly non decimal and variant from each other only in tens and hundreds place.

In [5]:
En = LabelEncoder() 
columns = data.columns
data['class'] = En.fit_transform(data['class'])
print(data['class'].head())
0    2
1    2
2    1
3    2
4    0
Name: class, dtype: int64

- In this above block of code we use one hot encoding to convert our categorical variable(class) for convenience.

In [6]:
newdata = data.copy()
print(pd.DataFrame(newdata.isnull().sum(), columns= ['Number of missing values']))
                             Number of missing values
compactness                                         0
circularity                                         5
distance_circularity                                4
radius_ratio                                        6
pr.axis_aspect_ratio                                2
max.length_aspect_ratio                             0
scatter_ratio                                       1
elongatedness                                       1
pr.axis_rectangularity                              3
max.length_rectangularity                           0
scaled_variance                                     3
scaled_variance.1                                   2
scaled_radius_of_gyration                           2
scaled_radius_of_gyration.1                         4
skewness_about                                      6
skewness_about.1                                    1
skewness_about.2                                    1
hollows_ratio                                       0
class                                               0

- Here we check missing values and we see a few missing values here and there in some attributes.

In [7]:
X = newdata.iloc[:,0:19] 
imputer = SimpleImputer(missing_values=np.nan, strategy='median', verbose=1)
transformed_values = imputer.fit_transform(X)
column = X.columns
newdata = pd.DataFrame(transformed_values, columns = column) 
print(pd.DataFrame(newdata.isnull().sum(), columns= ['Number of missing values']))                      
                             Number of missing values
compactness                                         0
circularity                                         0
distance_circularity                                0
radius_ratio                                        0
pr.axis_aspect_ratio                                0
max.length_aspect_ratio                             0
scatter_ratio                                       0
elongatedness                                       0
pr.axis_rectangularity                              0
max.length_rectangularity                           0
scaled_variance                                     0
scaled_variance.1                                   0
scaled_radius_of_gyration                           0
scaled_radius_of_gyration.1                         0
skewness_about                                      0
skewness_about.1                                    0
skewness_about.2                                    0
hollows_ratio                                       0
class                                               0

- Here we replace the missing values by the median of the attribute.

In [8]:
plt.figure(figsize= (20,15))
plt.subplot(9,3,1)
sns.boxplot(x= newdata['pr.axis_aspect_ratio'])
plt.subplot(9,3,2)
sns.boxplot(x= newdata.skewness_about)
plt.subplot(9,3,3)
sns.boxplot(x= newdata.scaled_variance)
plt.figure(figsize= (20,15))
plt.subplot(9,3,4)
sns.boxplot(x= newdata['radius_ratio'])
plt.subplot(9,3,5)
sns.boxplot(x= newdata['scaled_radius_of_gyration.1'])
plt.subplot(9,3,6)
sns.boxplot(x= newdata['scaled_variance.1'])
plt.figure(figsize= (20,15))
plt.subplot(9,3,7)
sns.boxplot(x= newdata['max.length_aspect_ratio'])
plt.subplot(9,3,8)
sns.boxplot(x= newdata['skewness_about.1'])
plt.show()
Q1 = newdata.quantile(0.25)
Q3 = newdata.quantile(0.75)
IQR = Q3 - Q1
newdata2 = newdata[~((newdata < (Q1 - 1.5 * IQR)) |(newdata > (Q3 + 1.5 * IQR))).any(axis=1)]
print('\n\n After removal of outliers:\n\n')
plt.figure(figsize= (20,15))
plt.subplot(9,3,1)
sns.boxplot(x= newdata2['pr.axis_aspect_ratio'])
plt.subplot(9,3,2)
sns.boxplot(x= newdata2.skewness_about)
plt.subplot(9,3,3)
sns.boxplot(x= newdata2.scaled_variance)
plt.figure(figsize= (20,15))
plt.subplot(9,3,4)
sns.boxplot(x= newdata2['radius_ratio'])
plt.subplot(9,3,5)
sns.boxplot(x= newdata2['scaled_radius_of_gyration.1'])
plt.subplot(9,3,6)
sns.boxplot(x= newdata2['scaled_variance.1'])
plt.figure(figsize= (20,15))
plt.subplot(9,3,7)
sns.boxplot(x= newdata2['max.length_aspect_ratio'])
plt.subplot(9,3,8)
sns.boxplot(x= newdata2['skewness_about.1'])
plt.show()

 After removal of outliers:


- In the above block of code we see presence of few outliers

- We use the IQR concept to check outliers.

- Since number of outliers were very less we removed it.

In [9]:
newdata2.hist(bins=20, figsize=(60,40))
plt.show()

- In the above block of code we check histograms and distributions.

- Almost all the attributes seem to be normally distributed

- scaled valriance.1, skewness_about.1, skewness_about.2 and scatter_ratio, seem to be right skewed.

In [10]:
skewness = newdata2.skew()
print("skewValue of dataframe attributes: \n", skewness)
skewValue of dataframe attributes: 
 compactness                    0.308063
circularity                    0.270836
distance_circularity           0.128710
radius_ratio                   0.104279
pr.axis_aspect_ratio           0.163894
max.length_aspect_ratio        0.100687
scatter_ratio                  0.592250
elongatedness                  0.040538
pr.axis_rectangularity         0.752291
max.length_rectangularity      0.279915
scaled_variance                0.599998
scaled_variance.1              0.814309
scaled_radius_of_gyration      0.261190
scaled_radius_of_gyration.1    0.522234
skewness_about                 0.613544
skewness_about.1               0.654281
skewness_about.2               0.233775
hollows_ratio                 -0.235264
class                          0.031880
dtype: float64
In [12]:
f, ax = plt.subplots(1, 5, figsize=(30,5))
sns.distplot(newdata2["scaled_variance.1"],bins=10, ax= ax[0])
sns.distplot(newdata2["scaled_variance"],bins=10, ax=ax[1])
sns.distplot(newdata2["skewness_about.1"],bins=10, ax= ax[2])
sns.distplot(newdata2["skewness_about"],bins=10, ax=ax[3])
sns.distplot(newdata2["scatter_ratio"],bins=10, ax=ax[4])
plt.show()

- Here we take a closer look at our skewed attributes.

In [13]:
def correlation_heatmap(dataframe,l,w):
    correlation = dataframe.corr()
    plt.figure(figsize=(l,w))
    sns.heatmap(correlation, vmax=1, square=True,annot=True,cmap='viridis')
    plt.show();
    
corr= newdata2.drop('class', axis=1)
correlation_heatmap(corr, 30,15)

- here we check correlation between variables using a heatmap(Pearson's coeff)

These attributes have Strong correlation ~ 0.8 and above

- Scaled Variance & Scaled Variance.1

- skewness_about_2 and hollow_ratio

- ditance_circularity and radius_ratio

- scaled _variance and scaled_radius_of_gyration

- pr.axis_recatngularity and max.length_recatngularity

- scatter_ratio and elongatedness

- elongatedness and pr.axis_rectangularity

These attributes have weak or no correlation

- max_length_aspect_ratio & radius_ratio

- pr.axis_aspect_ratio & max_length_aspect_ratio

- scaled_radius_gyration & scaled_radisu_gyration.1

- scaled_radius_gyration.1 & skewness_about

- skewness_about & skewness_about.1

- skewness_about.1 and skewness_about.2

In [15]:
sns.pairplot(newdata2, diag_kind="kde")
Out[15]:
<seaborn.axisgrid.PairGrid at 0x1a1e9bbb90>

In th above pair plot analysis and adding up to our correlation analysis we see a lot of attrinutes which are highly correlated. So we can decide to remove columns which are highly correlated.There are 8 such columns which can be considered not important:

- max.length_rectangularity

- scaled_radius_of_gyration

- skewness_about.2

- scatter_ratio

- elongatedness

- pr.axis_rectangularity

- scaled_variance

- scaled_variance.1

The rest columns can be considered important, but we will eventually get those using PCA

In [16]:
X = newdata2.iloc[:,0:18].values
y = newdata2.iloc[:,18].values
S = StandardScaler()
X_scaled = S.fit_transform(X)

In the above block of code we normalise/scale our data set

In [18]:
O_X_train,O_X_test,O_y_train,O_y_test = train_test_split(X_scaled,y,test_size=0.30,random_state=1)
print("{0:0.2f}% data is in training set".format((len(O_X_train)/len(data.index)) * 100))
print("{0:0.2f}% data is in test set".format((len(O_X_test)/len(data.index)) * 100))
67.26% data is in training set
28.84% data is in test set

In the above block of code we split the data into train:test(70%:30%)

In [19]:
svc = SVC()
svc.fit(O_X_train,O_y_train)
O_y_predict = svc.predict(O_X_test)
print("Model Score On Original Data ",svc.score(O_X_test, O_y_test)*100, '%')
print('\n\nConfusion matrix of Original Data:\n', confusion_matrix(O_y_test, O_y_predict ))
Model Score On Original Data  97.95081967213115 %


Confusion matrix of Original Data:
 [[ 78   0   0]
 [  1 119   1]
 [  0   3  42]]

- In the above block of code we train a support vector machine

- We get an accuracy of 97 .95 on the testing set

- We also see the confusion matrix

In [21]:
kfold = KFold(n_splits=10, random_state=1)
results1 = cross_val_score(svc, X_scaled, y, cv=kfold)
print(results1)
print("Accuracy: %.3f%% (%.3f%%)" % (results1.mean()*100.0, results1.std()*100.0))
[0.93902439 1.         0.96341463 1.         0.96296296 0.97530864
 0.97530864 0.9382716  0.96296296 0.97530864]
Accuracy: 96.926% (1.997%)

- Since a single test set can be misleading, we also perform a k fold cross validation getting an accuracy of 96.26% with a buffer of 1.9 %

In [22]:
cov_matrix = np.cov(X_scaled.T)
print("\n\nCovariance_matrix shape:",cov_matrix.shape)
print("\n\nCovariance_matrix:\n\n",cov_matrix)
eigenvalues, eigenvectors = np.linalg.eig(cov_matrix)
print('\n\nEigen Vectors:\n\n', eigenvectors)
print('\n Eigen Values: \n\n', eigenvalues)

Covariance_matrix shape: (18, 18)


Covariance_matrix:

 [[ 1.00123153e+00  6.80164027e-01  7.87792814e-01  7.46906930e-01
   2.00881439e-01  4.98273207e-01  8.11840645e-01 -7.89531434e-01
   8.12866245e-01  6.74996601e-01  7.92438680e-01  8.13494150e-01
   5.78399755e-01 -2.53990635e-01  2.00887113e-01  1.61304844e-01
   2.95777412e-01  3.64608943e-01]
 [ 6.80164027e-01  1.00123153e+00  7.87747162e-01  6.41725205e-01
   2.06409699e-01  5.64854067e-01  8.44804611e-01 -8.16768295e-01
   8.41196310e-01  9.62404205e-01  8.03750964e-01  8.33508154e-01
   9.26281607e-01  6.67790806e-02  1.40563881e-01 -1.43598307e-02
  -1.16976151e-01  3.92302597e-02]
 [ 7.87792814e-01  7.87747162e-01  1.00123153e+00  8.09326627e-01
   2.45756551e-01  6.69657073e-01  9.06692225e-01 -9.09806087e-01
   8.95884623e-01  7.69635504e-01  8.85221631e-01  8.89286924e-01
   7.03348558e-01 -2.38231284e-01  9.89345733e-02  2.63832735e-01
   1.29070982e-01  3.22051625e-01]
 [ 7.46906930e-01  6.41725205e-01  8.09326627e-01  1.00123153e+00
   6.67029240e-01  4.61258592e-01  7.90495472e-01 -8.45064567e-01
   7.64769672e-01  5.77501217e-01  7.93778346e-01  7.77097647e-01
   5.51222677e-01 -4.03672885e-01  4.03555670e-02  1.87420711e-01
   4.18869167e-01  5.05314324e-01]
 [ 2.00881439e-01  2.06409699e-01  2.45756551e-01  6.67029240e-01
   1.00123153e+00  1.38431761e-01  2.00217560e-01 -3.02289321e-01
   1.69961019e-01  1.46036511e-01  2.15074904e-01  1.86526180e-01
   1.53697623e-01 -3.25502385e-01 -5.16026240e-02 -2.86185855e-02
   4.06792617e-01  4.20318003e-01]
 [ 4.98273207e-01  5.64854067e-01  6.69657073e-01  4.61258592e-01
   1.38431761e-01  1.00123153e+00  4.98078976e-01 -5.02996017e-01
   4.97845069e-01  6.48642021e-01  4.12068816e-01  4.58456162e-01
   4.04786322e-01 -3.33161873e-01  8.41082601e-02  1.41145578e-01
   5.64852182e-02  3.94934461e-01]
 [ 8.11840645e-01  8.44804611e-01  9.06692225e-01  7.90495472e-01
   2.00217560e-01  4.98078976e-01  1.00123153e+00 -9.73537513e-01
   9.90659730e-01  8.08063766e-01  9.78751548e-01  9.94204811e-01
   7.95893849e-01  2.44702588e-03  6.35490363e-02  2.14445853e-01
  -3.10409338e-03  1.16323654e-01]
 [-7.89531434e-01 -8.16768295e-01 -9.09806087e-01 -8.45064567e-01
  -3.02289321e-01 -5.02996017e-01 -9.73537513e-01  1.00123153e+00
  -9.51112661e-01 -7.70982661e-01 -9.66090990e-01 -9.56973892e-01
  -7.63345981e-01  8.70842667e-02 -4.55135596e-02 -1.84181395e-01
  -1.05393355e-01 -2.11345600e-01]
 [ 8.12866245e-01  8.41196310e-01  8.95884623e-01  7.64769672e-01
   1.69961019e-01  4.97845069e-01  9.90659730e-01 -9.51112661e-01
   1.00123153e+00  8.11346565e-01  9.64981168e-01  9.88989478e-01
   7.93172901e-01  1.77904437e-02  7.28156271e-02  2.16892797e-01
  -2.65026808e-02  9.80719286e-02]
 [ 6.74996601e-01  9.62404205e-01  7.69635504e-01  5.77501217e-01
   1.46036511e-01  6.48642021e-01  8.08063766e-01 -7.70982661e-01
   8.11346565e-01  1.00123153e+00  7.50600479e-01  7.95049173e-01
   8.68007898e-01  5.26495142e-02  1.34795631e-01 -2.44448372e-03
  -1.17812145e-01  6.72596198e-02]
 [ 7.92438680e-01  8.03750964e-01  8.85221631e-01  7.93778346e-01
   2.15074904e-01  4.12068816e-01  9.78751548e-01 -9.66090990e-01
   9.64981168e-01  7.50600479e-01  1.00123153e+00  9.76750881e-01
   7.81984129e-01  1.68621531e-02  3.39888849e-02  2.05971428e-01
   2.28035846e-02  9.60435931e-02]
 [ 8.13494150e-01  8.33508154e-01  8.89286924e-01  7.77097647e-01
   1.86526180e-01  4.58456162e-01  9.94204811e-01 -9.56973892e-01
   9.88989478e-01  7.95049173e-01  9.76750881e-01  1.00123153e+00
   7.90805725e-01  1.62348310e-02  6.49567636e-02  2.03838067e-01
   7.85566308e-05  1.03330899e-01]
 [ 5.78399755e-01  9.26281607e-01  7.03348558e-01  5.51222677e-01
   1.53697623e-01  4.04786322e-01  7.95893849e-01 -7.63345981e-01
   7.93172901e-01  8.68007898e-01  7.81984129e-01  7.90805725e-01
   1.00123153e+00  2.16651698e-01  1.68973862e-01 -5.83635746e-02
  -2.32617810e-01 -1.20727281e-01]
 [-2.53990635e-01  6.67790806e-02 -2.38231284e-01 -4.03672885e-01
  -3.25502385e-01 -3.33161873e-01  2.44702588e-03  8.70842667e-02
   1.77904437e-02  5.26495142e-02  1.68621531e-02  1.62348310e-02
   2.16651698e-01  1.00123153e+00 -5.93373719e-02 -1.31142620e-01
  -8.43627948e-01 -9.18420730e-01]
 [ 2.00887113e-01  1.40563881e-01  9.89345733e-02  4.03555670e-02
  -5.16026240e-02  8.41082601e-02  6.35490363e-02 -4.55135596e-02
   7.28156271e-02  1.34795631e-01  3.39888849e-02  6.49567636e-02
   1.68973862e-01 -5.93373719e-02  1.00123153e+00 -4.53538836e-02
   8.48972195e-02  6.12111362e-02]
 [ 1.61304844e-01 -1.43598307e-02  2.63832735e-01  1.87420711e-01
  -2.86185855e-02  1.41145578e-01  2.14445853e-01 -1.84181395e-01
   2.16892797e-01 -2.44448372e-03  2.05971428e-01  2.03838067e-01
  -5.83635746e-02 -1.31142620e-01 -4.53538836e-02  1.00123153e+00
   7.28908031e-02  2.00156475e-01]
 [ 2.95777412e-01 -1.16976151e-01  1.29070982e-01  4.18869167e-01
   4.06792617e-01  5.64852182e-02 -3.10409338e-03 -1.05393355e-01
  -2.65026808e-02 -1.17812145e-01  2.28035846e-02  7.85566308e-05
  -2.32617810e-01 -8.43627948e-01  8.48972195e-02  7.28908031e-02
   1.00123153e+00  8.91041674e-01]
 [ 3.64608943e-01  3.92302597e-02  3.22051625e-01  5.05314324e-01
   4.20318003e-01  3.94934461e-01  1.16323654e-01 -2.11345600e-01
   9.80719286e-02  6.72596198e-02  9.60435931e-02  1.03330899e-01
  -1.20727281e-01 -9.18420730e-01  6.12111362e-02  2.00156475e-01
   8.91041674e-01  1.00123153e+00]]


Eigen Vectors:

 [[-2.72251046e-01 -8.97284818e-02  2.26045073e-02  1.30419032e-01
  -1.52324139e-01  2.58374578e-01 -1.88794221e-01 -7.71578238e-01
  -3.61784776e-01 -1.25233628e-01  2.92009470e-02  7.62442008e-04
  -1.06680587e-02  1.05983722e-02 -1.01407495e-01 -1.46326861e-01
  -3.81638532e-03  3.32992130e-03]
 [-2.85370045e-01  1.33173937e-01  2.10809943e-01 -2.06785531e-02
   1.39022591e-01 -6.88979940e-02  3.90871235e-01 -6.60528436e-02
  -4.62957583e-02  2.40262612e-01  7.29503235e-02  1.93799916e-01
  -7.74670931e-03 -8.71766559e-02 -3.11337823e-01  1.96463651e-01
  -2.96230720e-01  5.83996136e-01]
 [-3.01486231e-01 -4.40259591e-02 -7.08780817e-02  1.07425217e-01
   8.07335409e-02 -2.04800896e-02 -1.76384547e-01  2.98693883e-01
  -2.64499195e-01 -9.42971834e-02  7.78755026e-01 -2.32649049e-01
   1.11905744e-02  2.28724292e-02  5.89166755e-02  5.33931974e-02
   9.72735293e-02  8.64160083e-02]
 [-2.72594510e-01 -2.04232234e-01 -4.02139629e-02 -2.52957341e-01
  -1.19012554e-01 -1.39449676e-01 -1.56474448e-01  5.20410402e-02
  -1.70430331e-01  8.97062530e-02 -1.31647081e-01  2.75143903e-01
  -3.74689248e-02  2.90668794e-02 -2.04574984e-01  6.58916577e-01
   2.74900989e-01 -2.71300494e-01]
 [-9.85797647e-02 -2.59136858e-01  1.14805227e-01 -6.05228001e-01
  -8.32128223e-02 -5.87145492e-01 -1.02492950e-01 -1.61872497e-01
   1.17212341e-02  2.87528583e-02  4.97534613e-02 -1.45558629e-01
   2.09842091e-02 -9.40948646e-03  1.50893891e-01 -2.89610835e-01
  -1.19100067e-01  9.64017331e-02]
 [-1.94755787e-01 -9.45756320e-02  1.39313484e-01  3.22531411e-01
   6.21376071e-01 -2.65624695e-01 -3.98851794e-01 -5.85800952e-02
   1.73213170e-01 -2.49937617e-01 -1.98444456e-01  1.72600201e-01
  -1.06888298e-02  1.20980507e-02  1.76055013e-01  6.68511988e-02
  -2.92959443e-02  1.10841470e-01]
 [-3.10518442e-01  7.23350799e-02 -1.12924698e-01 -1.00540370e-02
  -8.12405608e-02  8.93335163e-02 -9.14237336e-02  8.45300921e-02
   1.37499298e-01  1.11244025e-01 -1.61642905e-01 -8.22439493e-02
   8.37148260e-01  2.72442207e-01 -1.51805844e-02 -7.66778803e-02
   5.60355480e-02  8.33248999e-02]
 [ 3.08438338e-01 -1.16876769e-02  9.00330455e-02  7.99117560e-02
   7.47379231e-02 -7.25853857e-02  1.04875746e-01 -2.16815347e-01
  -2.59988735e-01  1.24837047e-01 -4.29365477e-03 -3.50089602e-01
   2.42295907e-01  2.61394487e-03  4.61164909e-01  5.23226723e-01
  -2.65096114e-01 -1.36447171e-02]
 [-3.07548493e-01  8.40915278e-02 -1.11063547e-01  1.60464922e-02
  -7.75020996e-02  9.60554272e-02 -9.06723384e-02  3.37069994e-02
   1.03269951e-01  2.11468012e-01 -2.40841717e-01 -3.42527317e-01
  -9.86931593e-02 -6.84892390e-01  2.18872117e-01  2.39504315e-02
   2.70709305e-01  1.72817545e-01]
 [-2.76301073e-01  1.25836631e-01  2.19877688e-01  6.66507863e-02
   2.46140560e-01 -6.35014904e-02  3.49667685e-01 -2.26684736e-01
   2.44776407e-01  3.87473859e-01  2.24580349e-01  3.05154380e-02
  -1.40549391e-02  4.47385929e-02  1.53765067e-01 -1.04419937e-01
   1.53673085e-01 -5.43122947e-01]
 [-3.02748114e-01  7.01998575e-02 -1.44818765e-01 -6.98045095e-02
  -1.49584067e-01  1.34458896e-01 -7.54753072e-02  1.45772665e-01
   5.85239946e-02 -1.47036092e-01  2.06902072e-02  2.33368955e-01
   1.43866319e-02 -2.54510995e-01  1.79499013e-01  1.16604375e-02
  -7.26163025e-01 -3.24937516e-01]
 [-3.07040626e-01  7.79336637e-02 -1.15323952e-01 -1.73631584e-02
  -1.15117310e-01  1.26968672e-01 -6.99641470e-02  5.32611781e-02
   1.28904560e-01  1.60305310e-01 -1.96322990e-01 -2.75169550e-01
  -4.75672122e-01  6.13103868e-01  2.20362642e-01  7.99305617e-02
  -1.22815848e-01  1.42051799e-01]
 [-2.61520489e-01  2.09927277e-01  2.13627435e-01 -7.22457181e-02
   7.54871674e-03 -7.33961842e-02  4.55851958e-01  1.58194670e-01
  -3.37170589e-01 -5.87690102e-01 -2.58436921e-01 -1.07063554e-01
   8.61256926e-03  4.41891377e-02  1.43753708e-01 -5.21969873e-02
   1.69567965e-01 -8.32177228e-02]
 [ 4.36323635e-02  5.03914450e-01 -6.73920886e-02 -1.35860558e-01
  -1.40527774e-01 -1.31928871e-01 -7.90311042e-02 -3.00374428e-01
   5.01365221e-01 -3.87030017e-01  2.27875444e-01 -1.38958435e-01
   7.55464886e-03 -1.59765660e-02 -1.34656976e-01  3.04769192e-01
   5.39469506e-02  3.01217731e-02]
 [-3.67057041e-02 -1.45682524e-02  5.21623444e-01  4.90121679e-01
  -5.89800103e-01 -3.12415086e-01 -1.30187397e-01  1.14687509e-01
   7.50393829e-02  5.41502565e-02 -1.39861362e-02  5.61401152e-03
  -2.19811008e-03 -5.03222786e-03 -1.37166771e-02 -4.76724453e-03
  -3.27151282e-02 -2.14301813e-02]
 [-5.88504115e-02 -9.33980545e-02 -6.87170643e-01  3.80232477e-01
  -1.27793729e-01 -4.82506903e-01  3.10629290e-01 -1.18168951e-01
  -3.07213623e-02 -1.36044539e-02 -1.77010708e-02  8.59021362e-02
  -1.39575997e-02  1.10992435e-02  2.72433694e-02 -2.97178011e-02
   1.82173722e-02  1.83842486e-02]
 [-3.48373860e-02 -5.01664210e-01  6.22069465e-02 -3.55391597e-02
  -1.81582693e-01  2.75222340e-01  2.59557864e-01 -7.27008273e-02
   3.62122453e-01 -2.20343289e-01  1.73696003e-01  2.79657886e-01
   3.82401827e-02  7.76499049e-03  4.14581122e-01  1.14797284e-01
   1.66961820e-01  2.41026732e-01]
 [-8.28136172e-02 -5.06546563e-01  4.08035393e-02  1.03008417e-01
   1.11256244e-01  6.05771535e-02  1.76348774e-01  1.81034286e-02
   2.40710780e-01 -1.71416688e-01 -7.22825606e-02 -5.36171185e-01
   3.98716359e-03 -4.78049584e-02 -4.65683959e-01  8.53480643e-02
  -1.96223612e-01 -1.78387852e-01]]

 Eigen Values: 

 [9.79297570e+00 3.37710644e+00 1.20873054e+00 1.13659560e+00
 8.96286859e-01 6.58293128e-01 3.23056525e-01 2.26906613e-01
 1.12741686e-01 7.62069059e-02 6.18393099e-02 4.42420969e-02
 3.12610726e-03 1.01216098e-02 2.99919142e-02 2.67735138e-02
 1.77191935e-02 1.94537446e-02]

- In the above block of code we calculate the covariance matrix, eigen values and eigen vectors required for PCA

In [23]:
eig_pairs = [(eigenvalues[index], eigenvectors[:,index]) for index in range(len(eigenvalues))]
eig_pairs.sort()
eig_pairs.reverse()
print(eig_pairs)
eigvalues_sorted = [eig_pairs[index][0] for index in range(len(eigenvalues))]
eigvectors_sorted = [eig_pairs[index][1] for index in range(len(eigenvalues))]
print('\nEigenvalues in descending order:\n\n', eigvalues_sorted)
[(9.792975698382946, array([-0.27225105, -0.28537005, -0.30148623, -0.27259451, -0.09857976,
       -0.19475579, -0.31051844,  0.30843834, -0.30754849, -0.27630107,
       -0.30274811, -0.30704063, -0.26152049,  0.04363236, -0.0367057 ,
       -0.05885041, -0.03483739, -0.08281362])), (3.377106439893973, array([-0.08972848,  0.13317394, -0.04402596, -0.20423223, -0.25913686,
       -0.09457563,  0.07233508, -0.01168768,  0.08409153,  0.12583663,
        0.07019986,  0.07793366,  0.20992728,  0.50391445, -0.01456825,
       -0.09339805, -0.50166421, -0.50654656])), (1.2087305396350991, array([ 0.02260451,  0.21080994, -0.07087808, -0.04021396,  0.11480523,
        0.13931348, -0.1129247 ,  0.09003305, -0.11106355,  0.21987769,
       -0.14481876, -0.11532395,  0.21362744, -0.06739209,  0.52162344,
       -0.68717064,  0.06220695,  0.04080354])), (1.1365956021766952, array([ 0.13041903, -0.02067855,  0.10742522, -0.25295734, -0.605228  ,
        0.32253141, -0.01005404,  0.07991176,  0.01604649,  0.06665079,
       -0.06980451, -0.01736316, -0.07224572, -0.13586056,  0.49012168,
        0.38023248, -0.03553916,  0.10300842])), (0.8962868592787947, array([-0.15232414,  0.13902259,  0.08073354, -0.11901255, -0.08321282,
        0.62137607, -0.08124056,  0.07473792, -0.0775021 ,  0.24614056,
       -0.14958407, -0.11511731,  0.00754872, -0.14052777, -0.5898001 ,
       -0.12779373, -0.18158269,  0.11125624])), (0.6582931281646526, array([ 0.25837458, -0.06889799, -0.02048009, -0.13944968, -0.58714549,
       -0.26562469,  0.08933352, -0.07258539,  0.09605543, -0.06350149,
        0.1344589 ,  0.12696867, -0.07339618, -0.13192887, -0.31241509,
       -0.4825069 ,  0.27522234,  0.06057715])), (0.32305652510792204, array([-0.18879422,  0.39087124, -0.17638455, -0.15647445, -0.10249295,
       -0.39885179, -0.09142373,  0.10487575, -0.09067234,  0.34966769,
       -0.07547531, -0.06996415,  0.45585196, -0.0790311 , -0.1301874 ,
        0.31062929,  0.25955786,  0.17634877])), (0.226906612823581, array([-0.77157824, -0.06605284,  0.29869388,  0.05204104, -0.1618725 ,
       -0.0585801 ,  0.08453009, -0.21681535,  0.033707  , -0.22668474,
        0.14577266,  0.05326118,  0.15819467, -0.30037443,  0.11468751,
       -0.11816895, -0.07270083,  0.01810343])), (0.11274168632338634, array([-0.36178478, -0.04629576, -0.2644992 , -0.17043033,  0.01172123,
        0.17321317,  0.1374993 , -0.25998873,  0.10326995,  0.24477641,
        0.05852399,  0.12890456, -0.33717059,  0.50136522,  0.07503938,
       -0.03072136,  0.36212245,  0.24071078])), (0.07620690593266875, array([-0.12523363,  0.24026261, -0.09429718,  0.08970625,  0.02875286,
       -0.24993762,  0.11124403,  0.12483705,  0.21146801,  0.38747386,
       -0.14703609,  0.16030531, -0.5876901 , -0.38703002,  0.05415026,
       -0.01360445, -0.22034329, -0.17141669])), (0.06183930986648124, array([ 0.02920095,  0.07295032,  0.77875503, -0.13164708,  0.04975346,
       -0.19844446, -0.16164291, -0.00429365, -0.24084172,  0.22458035,
        0.02069021, -0.19632299, -0.25843692,  0.22787544, -0.01398614,
       -0.01770107,  0.173696  , -0.07228256])), (0.044242096949759564, array([ 0.00076244,  0.19379992, -0.23264905,  0.2751439 , -0.14555863,
        0.1726002 , -0.08224395, -0.3500896 , -0.34252732,  0.03051544,
        0.23336895, -0.27516955, -0.10706355, -0.13895843,  0.00561401,
        0.08590214,  0.27965789, -0.53617119])), (0.029991914206113038, array([-0.10140749, -0.31133782,  0.05891668, -0.20457498,  0.15089389,
        0.17605501, -0.01518058,  0.46116491,  0.21887212,  0.15376507,
        0.17949901,  0.22036264,  0.14375371, -0.13465698, -0.01371668,
        0.02724337,  0.41458112, -0.46568396])), (0.026773513807314873, array([-0.14632686,  0.19646365,  0.0533932 ,  0.65891658, -0.28961083,
        0.0668512 , -0.07667788,  0.52322672,  0.02395043, -0.10441994,
        0.01166044,  0.07993056, -0.05219699,  0.30476919, -0.00476724,
       -0.0297178 ,  0.11479728,  0.08534806])), (0.019453744598141094, array([ 0.00332992,  0.58399614,  0.08641601, -0.27130049,  0.09640173,
        0.11084147,  0.0833249 , -0.01364472,  0.17281754, -0.54312295,
       -0.32493752,  0.1420518 , -0.08321772,  0.03012177, -0.02143018,
        0.01838425,  0.24102673, -0.17838785])), (0.017719193496813595, array([-0.00381639, -0.29623072,  0.09727353,  0.27490099, -0.11910007,
       -0.02929594,  0.05603555, -0.26509611,  0.2707093 ,  0.15367309,
       -0.72616302, -0.12281585,  0.16956796,  0.05394695, -0.03271513,
        0.01821737,  0.16696182, -0.19622361])), (0.010121609778869563, array([ 0.01059837, -0.08717666,  0.02287243,  0.02906688, -0.00940949,
        0.01209805,  0.27244221,  0.00261394, -0.68489239,  0.04473859,
       -0.25451099,  0.61310387,  0.04418914, -0.01597657, -0.00503223,
        0.01109924,  0.00776499, -0.04780496])), (0.0031261072615285062, array([-0.01066806, -0.00774671,  0.01119057, -0.03746892,  0.02098421,
       -0.01068883,  0.83714826,  0.24229591, -0.09869316, -0.01405494,
        0.01438663, -0.47567212,  0.00861257,  0.00755465, -0.00219811,
       -0.0139576 ,  0.03824018,  0.00398716]))]

Eigenvalues in descending order:

 [9.792975698382946, 3.377106439893973, 1.2087305396350991, 1.1365956021766952, 0.8962868592787947, 0.6582931281646526, 0.32305652510792204, 0.226906612823581, 0.11274168632338634, 0.07620690593266875, 0.06183930986648124, 0.044242096949759564, 0.029991914206113038, 0.026773513807314873, 0.019453744598141094, 0.017719193496813595, 0.010121609778869563, 0.0031261072615285062]
In [24]:
summation = sum(eigenvalues)
variance = [(i/summation) for i in sorted(eigenvalues, reverse=True)]  # an array of variance explained by each 
# eigen vector... there will be 18 entries as there are 18 eigen vectors)
cum_variance=np.cumsum(variance)  # an array of cumulative variance. There will be 18 entries with 18 th entry 
# cumulative reaching almost 100%
plt.bar(range(1,19), variance, alpha=0.5, align='center', label='individual explained variance')
plt.step(range(1,19),cum_variance, where= 'mid', label='cumulative explained variance')
plt.ylabel('Explained variance ratio')
plt.xlabel('Principal components')
plt.legend(loc = 'best')
plt.show()

- After getting our required parameters we see from the explained variance graph that only 8 out of 18 attributes explain 95% of the varinace

- So we go ahead with 8 important Principle Components

In [25]:
P_reduce = np.array(eigvectors_sorted[0:8])   
X_std_8D = np.dot(X_scaled,P_reduce.T)   
reduced_pca = pd.DataFrame(X_std_8D) 
reduced_pca
Out[25]:
0 1 2 3 4 5 6 7
0 -0.591125 -0.655523 0.564477 -0.659870 0.855251 -1.835814 0.155983 -0.683144
1 1.524878 -0.327117 0.251528 1.296236 0.282463 -0.091649 -0.209862 0.127745
2 -3.969982 0.239514 1.229875 0.180391 -0.919360 -0.650638 -0.826445 0.163185
3 1.549729 -3.037566 0.466449 0.394413 0.623392 0.383794 -0.131539 -0.176248
4 -5.468963 4.651385 -1.290061 0.023804 -1.692033 2.510965 -0.315330 0.475009
... ... ... ... ... ... ... ... ...
808 0.368201 -0.641878 -1.481101 0.164090 -0.777381 -0.934650 -0.874360 0.193428
809 0.040917 -0.160848 -0.473839 -0.179208 1.978454 -1.431609 0.279248 -0.302916
810 -5.188919 -0.171319 0.585738 -0.886837 1.348744 0.225891 -0.888525 -0.429704
811 3.321748 -1.094132 -1.930953 0.339361 0.527587 -0.030116 0.265542 0.451123
812 5.012853 0.432697 -1.315713 0.196398 0.167606 0.345863 0.409124 -0.221262

813 rows × 8 columns

- Here we get the Principle components as our main features

In [26]:
sns.pairplot(reduced_pca, diag_kind='kde') 
Out[26]:
<seaborn.axisgrid.PairGrid at 0x1a23612b10>

- As we can see in the pairplot there is not correlation between any of our attributes

In [27]:
pca_X_train,pca_X_test,pca_y_train,pca_y_test = train_test_split(reduced_pca,y,test_size=0.30,random_state=1)
print("{0:0.2f}% data is in training set".format((len(pca_X_train)/len(data.index)) * 100))
print("{0:0.2f}% data is in test set".format((len(pca_X_test)/len(data.index)) * 100))
67.26% data is in training set
28.84% data is in test set

Above Step 3 but with PCA reducded data (splitting into train:test)

In [28]:
svc1 = SVC()
svc1.fit(pca_X_train,pca_y_train)
pca_y_predict = svc1.predict(pca_X_test)
print("Model Score On reduced Data ",svc1.score(pca_X_test, pca_y_test)*100, '%')
print('\n\nConfusion matrix of Original Data:\n', confusion_matrix(pca_y_test, pca_y_predict ))
Model Score On reduced Data  96.72131147540983 %


Confusion matrix of Original Data:
 [[ 78   0   0]
 [  2 118   1]
 [  1   4  40]]

- Here we train a SVM model with reduced PCA data set and see that accuracy is 96.72 %. This is less than our normal model but not a lot of accuracy is compromised given that we removed 10 attributes

In [29]:
kfold = KFold(n_splits=10, random_state=1)
results2 = cross_val_score(svc1,reduced_pca, y, cv=kfold)
print(results2)
print("Accuracy: %.3f%% (%.3f%%)" % (results2.mean()*100.0, results2.std()*100.0))
[0.92682927 0.98780488 0.96341463 0.96296296 0.95061728 0.97530864
 0.9382716  0.95061728 0.96296296 0.95061728]
Accuracy: 95.694% (1.667%)

- Above is k fold cross validation for PCA reduced data set

In [30]:
print("Classification Report For Raw Data:", "\n", classification_report(O_y_test,O_y_predict))
print("Classification Report For PCA:","\n", classification_report(pca_y_test,pca_y_predict))
print("\nCross Validation Accuracy of Raw data: %.3f%% (%.3f%%)" % (results1.mean()*100.0, results1.std()*100.0))
print("\nCross Validation Accuracy of PCA data: %.3f%% (%.3f%%)" % (results2.mean()*100.0, results2.std()*100.0))
Classification Report For Raw Data: 
               precision    recall  f1-score   support

         0.0       0.99      1.00      0.99        78
         1.0       0.98      0.98      0.98       121
         2.0       0.98      0.93      0.95        45

    accuracy                           0.98       244
   macro avg       0.98      0.97      0.98       244
weighted avg       0.98      0.98      0.98       244

Classification Report For PCA: 
               precision    recall  f1-score   support

         0.0       0.96      1.00      0.98        78
         1.0       0.97      0.98      0.97       121
         2.0       0.98      0.89      0.93        45

    accuracy                           0.97       244
   macro avg       0.97      0.95      0.96       244
weighted avg       0.97      0.97      0.97       244


Cross Validation Accuracy of Raw data: 96.926% (1.997%)

Cross Validation Accuracy of PCA data: 95.694% (1.667%)

- Here we generate reports for Raw trained data set and PCA reduced data set

- We see that Principle component analysis can do dimensionality reduction and choose only the very important Attributes by analysing the relationship between independent attributes to see which will be relevant in building model.

- Although we compromise very little with accuracy, PCA model is pure(as in no relation between independent varibles) and another huge advantage is we will be increasing processing speed and implementation would be easier